for graphic - plotly for data - pandas
# data
import pandas as pd #import library
import numpy as np
# graph
import plotly.express as px
import plotly.graph_objects as go
# sys
import sys
import os
# directory path
AppPath = sys.path[0]
data_folder = AppPath + '/data'
file = 'data.csv'
# printing the ways
print(data_folder)
print(AppPath)
# to raw data folder
os.chdir(data_folder)
# import the data
df = pd.read_csv(file, sep='~', parse_dates=True)
# converting dtypes
downcast = 'integer'
errors = 'coerce'
# to int
df['id']= pd.to_numeric(df['id'], downcast=downcast, errors=errors)
df['max_id'] = pd.to_numeric(df['max_id'], downcast=downcast, errors=errors)
df['min_id'] = pd.to_numeric(df['min_id'], downcast=downcast, errors=errors)
df['is_end'] = pd.to_numeric(df['is_end'], downcast=downcast, errors=errors)
df['is_warning'] = pd.to_numeric(df['is_warning'], downcast=downcast, errors=errors)
df['time_delta_min'] = df['time_delta_min'].astype('int64',errors='ignore')
# dt
df['date'] = pd.to_datetime(df['date'])
# float
df
D:\stat\telegram_trevoga/data D:\stat\telegram_trevoga
| id | last_end | unique_warning | index | _ | datetime | message | ttl_period | action | region | ... | month | war_day | war_week | weekday_num | weekday | is_end | is_warning | min_id | max_id | time_delta_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 350 | False | True | 1 | Message | 2022-04-21 16:47:12+00:00 | 🚨ОДЕССА И ОБЛАСТЬ — ВОЗДУШНАЯ ТРЕВОГА!\n\nВсе ... | NaN | NaN | Одеська | ... | April | 57 | 8 | 3 | Thursday | 0 | 1 | 350.0 | NaN | NaN |
| 1 | 349 | True | False | 2 | Message | 2022-04-21 14:55:54+00:00 | Отбой тревоги✅\n\nКанал тревог: https://t.me/+... | NaN | NaN | Одеська | ... | April | 57 | 8 | 3 | Thursday | 1 | 0 | NaN | 349.0 | 70.0 |
| 2 | 347 | False | True | 3 | Message | 2022-04-21 13:45:43+00:00 | 🚨ОДЕССА И ОБЛАСТЬ — ВОЗДУШНАЯ ТРЕВОГА!\n\nВсе ... | NaN | NaN | Одеська | ... | April | 57 | 8 | 3 | Thursday | 0 | 1 | 347.0 | NaN | NaN |
| 3 | 346 | True | False | 4 | Message | 2022-04-20 20:09:27+00:00 | Отбой тревоги✅ | NaN | NaN | Одеська | ... | April | 56 | 8 | 2 | Wednesday | 1 | 0 | NaN | 346.0 | 20.0 |
| 4 | 345 | False | True | 5 | Message | 2022-04-20 19:49:10+00:00 | 🚨ОДЕССА И ОБЛАСТЬ — ВОЗДУШНАЯ ТРЕВОГА!\n\nВсе ... | NaN | NaN | Одеська | ... | April | 56 | 8 | 2 | Wednesday | 0 | 1 | 345.0 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9040 | 8 | False | False | 1011 | Message | 2022-02-27 17:40:33+00:00 | ‼️‼️‼️Увага! Оголошено тривогу, усім пройти в ... | NaN | NaN | Донецька | ... | February | 4 | 0 | 6 | Sunday | 0 | 1 | 4.0 | NaN | NaN |
| 9041 | 7 | False | False | 1012 | Message | 2022-02-27 17:40:30+00:00 | ‼️‼️‼️Увага! Оголошено тривогу, усім пройти в ... | NaN | NaN | Донецька | ... | February | 4 | 0 | 6 | Sunday | 0 | 1 | 4.0 | NaN | NaN |
| 9042 | 6 | False | False | 1013 | Message | 2022-02-27 17:40:27+00:00 | ‼️‼️‼️Увага! Оголошено тривогу, усім пройти в ... | NaN | NaN | Донецька | ... | February | 4 | 0 | 6 | Sunday | 0 | 1 | 4.0 | NaN | NaN |
| 9043 | 5 | False | False | 1014 | Message | 2022-02-27 17:40:24+00:00 | ‼️‼️‼️Увага! Оголошено тривогу, усім пройти в ... | NaN | NaN | Донецька | ... | February | 4 | 0 | 6 | Sunday | 0 | 1 | 4.0 | NaN | NaN |
| 9044 | 4 | False | True | 1015 | Message | 2022-02-27 17:40:18+00:00 | ‼️‼️‼️Увага! Оголошено тривогу, усім пройти в ... | NaN | NaN | Донецька | ... | February | 4 | 0 | 6 | Sunday | 0 | 1 | 4.0 | NaN | NaN |
9045 rows × 25 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9045 entries, 0 to 9044 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 9045 non-null int16 1 last_end 9045 non-null bool 2 unique_warning 9045 non-null bool 3 index 9045 non-null int64 4 _ 9045 non-null object 5 datetime 9045 non-null object 6 message 9045 non-null object 7 ttl_period 0 non-null float64 8 action 0 non-null float64 9 region 9045 non-null object 10 command_center 9045 non-null object 11 time 9045 non-null object 12 date 9045 non-null datetime64[ns] 13 day 9045 non-null int64 14 month_num 9045 non-null int64 15 month 9045 non-null object 16 war_day 9045 non-null int64 17 war_week 9045 non-null int64 18 weekday_num 9045 non-null int64 19 weekday 9045 non-null object 20 is_end 9045 non-null int8 21 is_warning 9045 non-null int8 22 min_id 4971 non-null float64 23 max_id 4074 non-null float64 24 time_delta_min 3103 non-null float64 dtypes: bool(2), datetime64[ns](1), float64(5), int16(1), int64(6), int8(2), object(8) memory usage: 1.4+ MB
print('Number of starts of warnings: ',df[df['unique_warning'] == True].shape[0])
print('Numberof ends of warnings: ', df[df['last_end'] == True].shape[0])
Number of starts of warnings: 3105 Numberof ends of warnings: 3105
# take from 1 march
df_region = df[(df['region'] == 'Київська') & (df['date']>= pd.to_datetime('3/1/2022'))]
# global var
region = 'Kyiv'
last_date = df_region['datetime'].max()
df_region
| id | last_end | unique_warning | index | _ | datetime | message | ttl_period | action | region | ... | month | war_day | war_week | weekday_num | weekday | is_end | is_warning | min_id | max_id | time_delta_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2777 | 1364 | True | False | 0 | Message | 2022-04-21 18:04:37+00:00 | Київська обл.\nВідбій тривоги - будьте обережні | NaN | NaN | Київська | ... | April | 57 | 8 | 3 | Thursday | 1 | 0 | NaN | 1364.0 | 66.0 |
| 2778 | 1363 | False | False | 1 | Message | 2022-04-21 18:02:29+00:00 | Київ\nВідбій тривоги - будьте обережні | NaN | NaN | Київська | ... | April | 57 | 8 | 3 | Thursday | 1 | 0 | NaN | 1364.0 | NaN |
| 2779 | 1362 | False | False | 2 | Message | 2022-04-21 17:00:44+00:00 | Київська обл.\nТривога - Усі в укриття | NaN | NaN | Київська | ... | April | 57 | 8 | 3 | Thursday | 0 | 1 | 1361.0 | NaN | NaN |
| 2780 | 1361 | False | True | 3 | Message | 2022-04-21 16:58:02+00:00 | Київ\nТривога - Усі в укриття | NaN | NaN | Київська | ... | April | 57 | 8 | 3 | Thursday | 0 | 1 | 1361.0 | NaN | NaN |
| 2781 | 1360 | True | False | 4 | Message | 2022-04-21 14:58:46+00:00 | Київська обл.\nВідбій тривоги - будьте обережні | NaN | NaN | Київська | ... | April | 57 | 8 | 3 | Thursday | 1 | 0 | NaN | 1360.0 | 47.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4093 | 25 | True | False | 1337 | Message | 2022-03-01 07:24:39+00:00 | Київ, відбій тривоги🙏‼️ | NaN | NaN | Київська | ... | March | 6 | 0 | 1 | Tuesday | 1 | 0 | NaN | 25.0 | 139.0 |
| 4094 | 24 | False | False | 1338 | Message | 2022-03-01 06:54:46+00:00 | ‼️Повітряна тривога, Київ! | NaN | NaN | Київська | ... | March | 6 | 0 | 1 | Tuesday | 0 | 1 | 23.0 | NaN | NaN |
| 4095 | 23 | False | True | 1339 | Message | 2022-03-01 05:05:08+00:00 | ‼️ Повітряна тривога, Київ! | NaN | NaN | Київська | ... | March | 6 | 0 | 1 | Tuesday | 0 | 1 | 23.0 | NaN | NaN |
| 4096 | 22 | True | False | 1340 | Message | 2022-03-01 00:43:09+00:00 | ❕Відбій повітряної тривоги! | NaN | NaN | Київська | ... | March | 6 | 0 | 1 | Tuesday | 1 | 0 | NaN | 22.0 | 13.0 |
| 4097 | 21 | False | True | 1341 | Message | 2022-03-01 00:29:17+00:00 | Київ, повітряна тривога! | NaN | NaN | Київська | ... | March | 6 | 0 | 1 | Tuesday | 0 | 1 | 21.0 | NaN | NaN |
1321 rows × 25 columns
last_date
'2022-04-21 18:04:37+00:00'
# selecting
warday = df_region[['month_num','month','war_day', 'unique_warning']]
# grouping
warday = warday.groupby(by=['month_num','month','war_day']).sum()
warday = warday.reset_index()
# plotting
warday_line = px.line(warday, x='war_day', y="unique_warning", color="month",
title=f'intensity of warnings by days in {region}')
warday_line.show()
# selecting
weekday = df_region[['weekday_num', 'weekday', 'unique_warning', 'month', 'month_num']]
# grouping
weekday = weekday.groupby(by=['month_num', 'month','weekday_num', 'weekday']).sum()
weekday = weekday.reset_index()
# printing
weekday_bar = px.bar(weekday, x="weekday", y="unique_warning", color="month", barmode="group",
title=f'The absolute number of warnings per weekday in {region}')
weekday_bar.show()
We count the median
# selecting
weekday = df_region[['weekday_num', 'weekday', 'unique_warning', 'month', 'month_num', 'war_day']] # addedwarday
# grouping 1
weekday = weekday.groupby(by=['month_num', 'month','weekday_num', 'weekday', 'war_day']).sum()
weekday = weekday.reset_index()
# grouping 2
weekday = weekday.groupby(by=['month_num', 'month','weekday_num', 'weekday']).median()
weekday = weekday.reset_index()
weekday
# printing
weekday_bar = px.bar(weekday, x="weekday", y="unique_warning", color="month", barmode="group",
title=f'median number of warnings per weekday in {region}')
weekday_bar.show()
# selecting
weekday = df_region[['weekday_num', 'weekday', 'unique_warning', 'month', 'month_num', 'war_day']] # addedwarday
# grouping 1
weekday = weekday.groupby(by=['month_num', 'month','weekday_num', 'weekday', 'war_day']).sum()
weekday = weekday.reset_index()
# printing
weekday_boxplot = px.box(weekday, x='month' ,y="unique_warning", color="weekday",
title=f'distribution of warnings per weekday per months in {region}')
weekday_boxplot.show()
# selecting
warday = df_region[['month','war_day', 'time_delta_min']]
# grouping
warday = warday.groupby(by=['war_day', 'month']).sum()
warday = warday.reset_index()
# plotting
warday_line = px.line(warday, x='war_day', y="time_delta_min", color="month")
warday_line.show()
# selecting
warday = df_region[['month','war_day', 'time_delta_min']].copy()
# grouping median
warday_median = warday.groupby(by=['war_day', 'month']).median()
warday_median['measurement'] = 'Median'
# grouping mean
warday_mean = warday.groupby(by=['war_day', 'month']).mean()
warday_mean['measurement'] = 'Mean'
# union
warday = pd.concat([warday_median, warday_mean])
warday = warday.reset_index()
# plotting
warday_line_mean = px.line(warday, x='war_day', y="time_delta_min", color="measurement",
title=f'median and mean duration of one warning by days of war in {region}')
warday_line_mean.show()
# selecting
weekday = df_region[['weekday_num', 'weekday', 'time_delta_min', 'month', 'month_num']] # addedwarday
# grouping 1
weekday = weekday.groupby(by=['month_num', 'month','weekday_num', 'weekday']).median()
weekday = weekday.reset_index()
# printing
weekday_bar = px.bar(weekday, x="weekday", y="time_delta_min", color="month", barmode="group",
title=f'median time of warnings per weekday in {region}')
weekday_bar.show()
# selecting
weekday = df_region[['weekday_num', 'weekday', 'time_delta_min', 'month', 'month_num', 'war_day']] # addedwarday
# grouping 1
weekday = weekday.groupby(by=['month_num', 'month','weekday_num', 'weekday', 'war_day']).sum()
weekday = weekday.reset_index()
weekday
# printing
weekday_boxplot = px.box(weekday, x='month' ,y="time_delta_min", color="weekday",
title=f'distribution of time in warnings per weekday per months in {region}')
weekday_boxplot.show()
# selecting
weekday = df_region[['weekday_num', 'weekday', 'time_delta_min', 'month', 'month_num', 'war_day']] # added warday
weekday = weekday[pd.isna(weekday['time_delta_min']) == False].sort_values(by=['month_num', 'month', 'weekday_num', 'weekday'])
# grouping 1
weekday
# printing
weekday_boxplot = px.box(weekday, x='month' ,y="time_delta_min", color="weekday",
title=f'distribution of time of 1 warning per weekday per months {region}')
weekday_boxplot.show()
# selecting
month_dist = df_region[['weekday_num', 'weekday', 'time_delta_min', 'month', 'month_num', 'war_day']] # added warday
month_dist = month_dist[pd.isna(month_dist['time_delta_min']) == False].sort_values(by=['month_num', 'month'])
# printing
hist = px.histogram(month_dist, x="time_delta_min", color="month", nbins=100, marginal="box" , facet_row="month",
title=f'histogram of time of 1 warning per months in {region}')
hist.show()
# function of counting the Kaplan Meier
def kaplan_meier(df, duration_column='', event_column=''):
df = df[df[duration_column] > 0]
durations = df.sort_values(duration_column)[duration_column].unique()
# Initialise the table
columns = ['duration', 'n_at_risk', 'n_events',
'survival_probability']
km = pd.DataFrame(columns=columns, dtype=np.number)
km = km.append(pd.DataFrame([[0, df.shape[0], 0, 1]],
columns=columns))
# Calculate survival probability for each duration
for i, t in enumerate(durations):
n = np.sum(df[duration_column]>=t)
d = np.sum((df[duration_column]==t) & (df[event_column]==True))
s = (1 - d / n) * km.loc[i, 'survival_probability']
km = km.append(pd.DataFrame([[t, n, d, s]],
index=[i+1],
columns=columns))
# km = km[(pd.isna(km['survival_probability']) == False)]
return km
# seleting necessary values
# per all time
df_km = df_region[['time_delta_min', 'last_end']]
df_km = df_km[(df_km['last_end'] == True) & (df_km['time_delta_min'] > 0)]
df_km['survival_probability'] = 1
df_km['event'] = 1
df_km.index = range(0, df_km.shape[0])
# per month
df_km_march = df_region[df_region['month']=='March']
df_km_april = df_region[df_region['month']=='April']
# counting the Kaplan Meier probability
km_march = kaplan_meier(df_km_march, duration_column='time_delta_min', event_column='last_end')
km_april = kaplan_meier(df_km_april, duration_column='time_delta_min', event_column='last_end')
# creating the plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=km_march['duration'],
y=km_march['survival_probability'],
line_shape='vh', name=f'March'))
fig.add_trace(go.Scatter(x=km_april['duration'],
y=km_april['survival_probability'],
line_shape='hv', name=f'April'))
fig.add_trace(go.Scatter(x=np.array([0, km_march['duration'].max()]), y=np.array([0.5, 0.5]), name="median",
line_shape='linear', line_color='rgb(0,176,246)'))
# fig.update_layout(legend=dict(y=0.5, traceorder='reversed', font_size=16))
fig.update_layout(title=f'Kaplan Meiers probability of warnings in {region}',
xaxis_title='duration of warnings (mins)',
yaxis_title='probaility',)
fig.show()
# selecting
df_regr = df_region[['month_num', 'month', 'war_day','unique_warning', 'time_delta_min']]
# grouping 1
wardays_warnings = df_regr[['month_num', 'month',
'war_day','unique_warning']].groupby(by=['month_num', 'month', 'war_day']).sum()
# grouping 2
wardays_durations = df_regr[['month_num', 'month',
'war_day','time_delta_min']].groupby(by=['month_num', 'month', 'war_day']).sum()
# joinings
df_durations_warnings = pd.merge(left=wardays_warnings, right=wardays_durations,
left_index=True, right_index=True)
df_durations_warnings = df_durations_warnings.reset_index()
df_durations_warnings['duration of warnings per day (mins)'] = df_durations_warnings['time_delta_min']
df_durations_warnings['number of warnings per day'] = df_durations_warnings['unique_warning']
# printing the figure
fig = px.scatter(df_durations_warnings, x='number of warnings per day', y='duration of warnings per day (mins)',
color='month', title=f'scatter of durations and number of warnings in {region}')
fig.show()
Now we can see that situation in Kyiv with air warnings in April is better than was in March. It has relationship to news from North that Russian army lossed the battle of Kyiv and overdislocated to South and East of Ukraine for attacking the OUF zone. For our hopes Russians will suck our cock